Concrete Strength Prediction

In [97]:
import warnings
warnings.filterwarnings('ignore')
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('ggplot')
import seaborn as sns
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable

Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable

Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable

Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable

Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable

Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable

Fine Aggregate (component 7) -- quantitative -- kg in a m3 mixture -- Input Variable

Age -- quantitative -- Day (1~365) -- Input Variable

Concrete compressive strength -- quantitative -- MPa -- Output Variable
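
The CSV loaded in the next cell stores these components under short column names (cement, slag, ash, water, superplastic, coarseagg, fineagg, age, strength). A minimal sanity-check sketch, assuming that file name and those column names:

# Sketch (hypothetical check, not part of the original workflow):
# confirm the expected schema and the documented age range before analysis
expected_cols = ['cement', 'slag', 'ash', 'water', 'superplastic',
                 'coarseagg', 'fineagg', 'age', 'strength']
df_check = pd.read_csv("concrete.csv")
assert list(df_check.columns) == expected_cols
assert df_check['age'].between(1, 365).all()   # age is documented as 1~365 days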

In [98]:
# importing data
df = pd.read_csv("concrete.csv")
df.head()      # used to see top 5 rows of the dataset
Out[98]:
cement slag ash water superplastic coarseagg fineagg age strength
0 141.3 212.0 0.0 203.5 0.0 971.8 748.5 28 29.89
1 168.9 42.2 124.3 158.3 10.8 1080.8 796.2 14 23.51
2 250.0 0.0 95.7 187.4 5.5 956.9 861.2 28 29.22
3 266.0 114.0 0.0 228.0 0.0 932.0 670.0 28 45.85
4 154.8 183.4 0.0 193.3 9.1 1047.4 696.7 28 18.29

1. Univariate analysis (10 )

In [99]:
#five point analysis
df.describe().transpose()
Out[99]:
count mean std min 25% 50% 75% max
cement 1030.0 281.167864 104.506364 102.00 192.375 272.900 350.000 540.0
slag 1030.0 73.895825 86.279342 0.00 0.000 22.000 142.950 359.4
ash 1030.0 54.188350 63.997004 0.00 0.000 0.000 118.300 200.1
water 1030.0 181.567282 21.354219 121.80 164.900 185.000 192.000 247.0
superplastic 1030.0 6.204660 5.973841 0.00 0.000 6.400 10.200 32.2
coarseagg 1030.0 972.918932 77.753954 801.00 932.000 968.000 1029.400 1145.0
fineagg 1030.0 773.580485 80.175980 594.00 730.950 779.500 824.000 992.6
age 1030.0 45.662136 63.169912 1.00 7.000 28.000 56.000 365.0
strength 1030.0 35.817961 16.705742 2.33 23.710 34.445 46.135 82.6
In [100]:
#checking for datatypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cement        1030 non-null   float64
 1   slag          1030 non-null   float64
 2   ash           1030 non-null   float64
 3   water         1030 non-null   float64
 4   superplastic  1030 non-null   float64
 5   coarseagg     1030 non-null   float64
 6   fineagg       1030 non-null   float64
 7   age           1030 non-null   int64  
 8   strength      1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB
In [101]:
#no of rows and columns
df.shape
Out[101]:
(1030, 9)
In [102]:
#checking for null values
df.isnull().values.any()
Out[102]:
False
In [103]:
#checking for any skewed values
df.skew()
Out[103]:
cement          0.509481
slag            0.800717
ash             0.537354
water           0.074628
superplastic    0.907203
coarseagg      -0.040220
fineagg        -0.253010
age             3.269177
strength        0.416977
dtype: float64
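
age stands out with a skewness of 3.27, while the other columns are close to symmetric. The notebook keeps age untransformed; if a correction were wanted, a log1p transform is a common remedy. A minimal sketch, not applied below:

# Sketch: reduce the right skew of 'age' with log1p and compare skewness
age_log = np.log1p(df['age'])
print(df['age'].skew(), age_log.skew())   # before vs. after the transform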
In [104]:
#visualizing age skewness using a box plot
sns.boxplot(df['age'])
Out[104]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c3a6ac8>
In [105]:
sns.distplot(df['age'])
# Age values up to 365 are valid, since age is documented to range from 1 to 365 days
Out[105]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c3d2b88>
In [106]:
sns.distplot(df['fineagg'])
Out[106]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c4e6688>
In [107]:
sns.distplot(df['coarseagg'])
Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c587988>
In [108]:
sns.distplot(df['superplastic'])
Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c610088>
In [109]:
sns.distplot(df['water'])
Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c6c6688>
In [110]:
sns.distplot(df['ash'])
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c772d48>
In [111]:
sns.distplot(df['slag'])
Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c7b9448>
In [112]:
sns.distplot(df['cement'])
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c88d888>
In [113]:
sns.distplot(df['strength'], kde= True)
Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c8fda88>
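
The nine per-column plots above can also be produced in a single loop; a compact sketch using the same distplot call as the cells above:

# Sketch: all nine distributions in one 3x3 grid instead of separate cells
fig, axes = plt.subplots(3, 3, figsize=(15, 10))
for ax, col in zip(axes.ravel(), df.columns):
    sns.distplot(df[col], ax=ax)
plt.tight_layout()
plt.show()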

2. Bivariate Analysis (10 )

In [114]:
sns.pairplot(df)
#From the pairplot, strength increases as the cement content increases
Out[114]:
<seaborn.axisgrid.PairGrid at 0x2185c8f8288>
In [115]:
# visualise the cement-strength relationship
sns.regplot(x="cement", y="strength", data=df, fit_reg=False)
Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185f60b2c8>
In [116]:
sns.regplot(x="slag", y="strength", data=df, fit_reg=False)
Out[116]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861191888>
In [117]:
sns.regplot(x="ash", y="strength", data=df, fit_reg=False)
Out[117]:
<matplotlib.axes._subplots.AxesSubplot at 0x218611cd888>
In [118]:
sns.regplot(x="water", y="strength", data=df, fit_reg=False)
Out[118]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861cd7f08>
In [119]:
sns.regplot(x="superplastic", y="strength", data=df, fit_reg=False)
Out[119]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c3dc288>
In [120]:
sns.regplot(x="coarseagg", y="strength", data=df, fit_reg=False)
Out[120]:
<matplotlib.axes._subplots.AxesSubplot at 0x2185c36b388>
In [121]:
sns.regplot(x="fineagg", y="strength", data=df, fit_reg=False)
Out[121]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861dd8e08>
In [122]:
sns.regplot(x="age", y="strength", data=df, fit_reg=False)
Out[122]:
<matplotlib.axes._subplots.AxesSubplot at 0x21861e49a88>
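
To back the scatter plots with numbers, the Pearson correlation of each input with strength can be ranked; a short sketch:

# Sketch: rank the inputs by their linear correlation with compressive strength
corr_with_target = df.corr()['strength'].drop('strength').sort_values(ascending=False)
print(corr_with_target)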

3. Feature engineering techniques (10 )

In [123]:
#Checking for highly correlated variables
df.corr()
#No pair of independent variables is strongly correlated (the largest magnitude is water vs. superplastic at -0.66), so all variables are kept as they are.
Out[123]:
cement slag ash water superplastic coarseagg fineagg age strength
cement 1.000000 -0.275216 -0.397467 -0.081587 0.092386 -0.109349 -0.222718 0.081946 0.497832
slag -0.275216 1.000000 -0.323580 0.107252 0.043270 -0.283999 -0.281603 -0.044246 0.134829
ash -0.397467 -0.323580 1.000000 -0.256984 0.377503 -0.009961 0.079108 -0.154371 -0.105755
water -0.081587 0.107252 -0.256984 1.000000 -0.657533 -0.182294 -0.450661 0.277618 -0.289633
superplastic 0.092386 0.043270 0.377503 -0.657533 1.000000 -0.265999 0.222691 -0.192700 0.366079
coarseagg -0.109349 -0.283999 -0.009961 -0.182294 -0.265999 1.000000 -0.178481 -0.003016 -0.164935
fineagg -0.222718 -0.281603 0.079108 -0.450661 0.222691 -0.178481 1.000000 -0.156095 -0.167241
age 0.081946 -0.044246 -0.154371 0.277618 -0.192700 -0.003016 -0.156095 1.000000 0.328873
strength 0.497832 0.134829 -0.105755 -0.289633 0.366079 -0.164935 -0.167241 0.328873 1.000000
In [124]:
plt.figure(figsize=(10,8))

sns.heatmap(df.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False)

plt.show()
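
Besides the pairwise correlation matrix, a variance inflation factor (VIF) check is a common way to screen for multicollinearity. A sketch using statsmodels, which is an extra dependency not imported above:

# Sketch: VIF per predictor; values well above ~5-10 would flag multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

X_vif = sm.add_constant(df.drop('strength', axis=1))
for i, col in enumerate(X_vif.columns):
    if col != 'const':
        print(col, round(variance_inflation_factor(X_vif.values, i), 2))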
In [125]:
# Copy all the predictor variables into X dataframe. Since 'strength' is dependent variable drop it
X = df.drop('strength', axis=1)

# Copy the 'strength' column alone into the y dataframe. This is the dependent variable
y = df[['strength']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
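
The tree-based models used below are insensitive to feature scale, but the Ridge and Lasso penalties are not. If the penalised linear models were the focus, standardising the predictors first would be reasonable; a sketch, not applied in this notebook:

# Sketch: standardise predictors before penalised linear models (Ridge/Lasso)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # fit the scaler on the training split only
X_test_scaled = scaler.transform(X_test)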
In [126]:
#Using PolynomialFeatures to create additional interaction features as independent variables
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2, interaction_only=True)
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
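
With degree=2 and interaction_only=True, the 8 inputs expand to 37 columns (1 bias term, 8 linear terms and 28 pairwise interactions), which matches the length of the coefficient vectors printed below. To see which column is which, the transformer can report the generated names; a sketch (the method name depends on the scikit-learn version):

# Sketch: inspect the 37 generated feature names
try:
    poly_names = poly.get_feature_names_out(X.columns)   # scikit-learn >= 1.0
except AttributeError:
    poly_names = poly.get_feature_names(X.columns)       # older scikit-learn
print(len(poly_names), list(poly_names)[:10])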

4. Creating the model and tuning it (30 )

In [127]:
#Linear Regression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
regression_model.score(X_test, y_test)
Out[127]:
0.7444710081439875
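
R² alone hides the size of the errors; RMSE and MAE in MPa are easier to judge against strengths that average about 36 MPa. A short sketch for the linear model:

# Sketch: report errors in MPa alongside the R^2 score above
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_pred = regression_model.predict(X_test)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE :", mean_absolute_error(y_test, y_pred))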
In [128]:
#Ridge
ridge = Ridge(alpha=.3)
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))
Ridge model: [[ 0.00000000e+00  3.34737287e-01 -8.90485076e-02 -6.30434183e-02
   1.32035147e+00 -5.86750605e+00 -1.02711861e-01 -2.79869108e-01
  -1.60696328e-01  8.73200298e-05  1.77720733e-04 -1.76103896e-03
  -3.06993810e-03  4.03666779e-05  7.44596754e-05  4.39670032e-04
   3.25685432e-04 -8.96567438e-04  6.94464692e-04  5.50193577e-05
   3.02286525e-04  7.80669809e-04 -1.70452413e-03 -6.47920159e-03
   5.66003664e-05  3.90510372e-04  1.67035513e-03  1.45558038e-02
  -6.68676049e-04 -1.90665156e-04 -7.25230464e-04  3.55321606e-03
   1.74081215e-03  6.37347701e-03  2.61315740e-04 -6.42094713e-05
   3.62849389e-04]]
In [129]:
#Lasso: checking whether any coefficients are shrunk to zero, which would suggest variables that could be dropped.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))
Lasso model: [ 0.00000000e+00  2.05754116e-01 -3.87982396e-02 -4.35284753e-01
  1.23746835e+00  1.33809450e-01 -5.29179020e-02 -1.51825356e-01
 -2.78460812e-01  8.95660513e-05  2.60779047e-04 -1.33378929e-03
 -3.95450528e-03  7.18779238e-05  1.00597102e-04  3.72222764e-04
  3.88802403e-04 -8.09034433e-04 -1.02471621e-03  2.93639011e-05
  2.50952391e-04  7.37821730e-04 -9.66548234e-04 -7.33838283e-03
  2.20040160e-04  4.67341433e-04  1.55572176e-03  9.50087061e-03
 -6.25908111e-04 -3.76717225e-04 -3.77843298e-04  1.15780298e-03
 -1.45978046e-03  8.20078420e-03  1.70095680e-04  3.36183515e-05
  3.31263963e-04]
In [130]:
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))
0.7549507401174544
0.744826902205591
In [131]:
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))
#The test scores of the linear, Ridge and Lasso models are comparable, all around 74-75%. We will try tree-based algorithms next.
0.7525566410009125
0.7479043301769873
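
The alpha values above (0.3 for Ridge, 0.1 for Lasso) were fixed by hand; a small grid search over alpha is a cheap way to check whether they are close to optimal. A sketch for Ridge (the alpha grid is illustrative):

# Sketch: search the regularisation strength instead of fixing it manually
alpha_grid = {'alpha': [0.01, 0.1, 0.3, 1, 3, 10]}
ridge_cv = GridSearchCV(Ridge(), param_grid=alpha_grid, scoring='r2', cv=5)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)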
In [132]:
dtRegressor = DecisionTreeRegressor(random_state=0)
dtRegressor.fit(X_train,y_train)
print(dtRegressor.score(X_train, y_train))
print(dtRegressor.score(X_test, y_test))
#The decision tree regressor already reaches the required score (~0.83 on test) before cross-validation
0.9948592423407845
0.8250944096509519
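
The large train/test gap (0.99 vs 0.83) confirms the unpruned tree overfits; the tuning further down addresses this. The fitted tree also exposes which features drive its splits; a sketch (indices refer to the polynomial feature columns):

# Sketch: ten most important polynomial features in the unpruned tree
importances = pd.Series(dtRegressor.feature_importances_).sort_values(ascending=False)
print(importances.head(10))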
In [133]:
randomForestRegressor = RandomForestRegressor(max_depth=2, random_state=0)
randomForestRegressor.fit(X_train,y_train)
print(randomForestRegressor.score(X_train, y_train))
print(randomForestRegressor.score(X_test, y_test))
0.6769159140478779
0.6310812179568712
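
max_depth=2 constrains the forest heavily, which explains the modest ~0.63 test score. Letting the trees grow deeper usually helps on this data; a sketch with a less constrained forest (the settings are illustrative, not tuned):

# Sketch: a deeper forest with default depth
rf_deep = RandomForestRegressor(n_estimators=100, random_state=0)
rf_deep.fit(X_train, y_train.values.ravel())   # ravel avoids the column-vector warning
print(rf_deep.score(X_train, y_train), rf_deep.score(X_test, y_test))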
In [134]:
from sklearn.ensemble import AdaBoostRegressor
abRegressor = AdaBoostRegressor(random_state=0, n_estimators=100)
abRegressor.fit(X_train,y_train)
print(abRegressor.score(X_train, y_train))
print(abRegressor.score(X_test, y_test))
0.8615854235930094
0.8157435134345754
In [135]:
from sklearn.ensemble import GradientBoostingRegressor
gradientRegressor = GradientBoostingRegressor(random_state=0)
gradientRegressor.fit(X_train,y_train)
print(gradientRegressor.score(X_train, y_train))
print(gradientRegressor.score(X_test, y_test))
0.9633315530162665
0.9032674789608472
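
Gradient boosting fits trees sequentially, so its test performance can be tracked stage by stage to see when extra trees stop helping; a sketch using staged_predict on the fitted model:

# Sketch: test R^2 after each boosting stage
from sklearn.metrics import r2_score
staged_r2 = [r2_score(y_test, pred) for pred in gradientRegressor.staged_predict(X_test)]
plt.plot(staged_r2)
plt.xlabel('boosting stage')
plt.ylabel('test R^2')
plt.show()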

So far the best results come from the DecisionTreeRegressor (~99% train, ~82% test R²) and the GradientBoostingRegressor (~96% train, ~90% test R²).

We will take the DecisionTreeRegressor and the GradientBoostingRegressor forward for hyperparameter tuning and cross-validation.

We will find the best combination of hyperparameters and then run K-fold cross-validation with those settings before concluding on the result.

DecisionTree Regressor

In [136]:
dtRegressor.get_params().keys()
Out[136]:
dict_keys(['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])
In [137]:
#We will try RandomizedSearchCV first
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [1, 5, 10, 20]
parameters = [{'max_depth':depths,
              'min_samples_leaf':num_leafs}]
dtRegressorRSCV = RandomizedSearchCV(dtRegressor, parameters, cv=samples)
dtRegressorRSCV.fit(X, y)
print(dtRegressorRSCV.best_params_)
{'min_samples_leaf': 1, 'max_depth': 17}
In [138]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(dtRegressorRSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.79637042 0.83790681 0.85108882]
R2: 82.846% (2.332%)
In [139]:
#We will try to use GridSearchCV now
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [1, 5, 10, 20]
parameters = {    'min_samples_leaf'    : num_leafs,
                  'max_depth'    : depths
             }
dtRegressorGSCV = GridSearchCV(dtRegressor, param_grid = parameters, scoring='r2', cv=samples)
dtRegressorGSCV.fit(X, y)
print(dtRegressorGSCV.best_params_)
{'max_depth': 15, 'min_samples_leaf': 1}
In [140]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(dtRegressorGSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.79637042 0.84750034 0.84729649]
R2: 83.039% (2.405%)

Gradient Boosting Regressor

In [141]:
gradientRegressor.get_params().keys()
Out[141]:
dict_keys(['alpha', 'ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'presort', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'])
In [142]:
#We will try RandomizedSearchCV
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [ 20, 30 ]
parameters = [{'max_depth':depths,
              'min_samples_leaf':num_leafs}]
gradientRegressorRSCV = RandomizedSearchCV(gradientRegressor, parameters, cv=samples)
gradientRegressorRSCV.fit(X, y)
print(gradientRegressorRSCV.best_params_)
{'min_samples_leaf': 20, 'max_depth': 14}
In [143]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(gradientRegressorRSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.91283625 0.91428678 0.90510598]
R2: 91.074% (0.403%)
In [144]:
#We will try to use GridSearchCV 
samples = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [ 20 , 30]
parameters = {    'max_depth'    : depths,
                  'min_samples_leaf': num_leafs
             }
gradientRegressorGSCV = GridSearchCV(gradientRegressor, param_grid = parameters, scoring='r2', cv=samples)
gradientRegressorGSCV.fit(X, y)
print(gradientRegressorGSCV.best_params_)
{'max_depth': 16, 'min_samples_leaf': 20}
In [145]:
num_folds = 3
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed)
results = cross_val_score(gradientRegressorGSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
[0.91306071 0.91559105 0.90228687]
R2: 91.031% (0.577%)
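
The cross-validated scores above were computed on the full data set. As a final check, the best gradient-boosting parameters can be refitted on the training split and scored once on the untouched test split (note the search above used the original eight features, while this split uses the polynomial expansion); a sketch:

# Sketch: refit the best configuration and score it on the held-out test split
best_gb = GradientBoostingRegressor(random_state=0, **gradientRegressorGSCV.best_params_)
best_gb.fit(X_train, y_train.values.ravel())
print("Held-out test R2:", best_gb.score(X_test, y_test))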

We can choose either the GradientBoostingRegressor or the DecisionTreeRegressor to get an R² score above 80%; the gradient boosting model is the stronger of the two, with a cross-validated R² of about 91% versus about 83%.

#### Fold count was reduced for faster execution. I tried larger fold counts and got comparable cross-validation values.